Create a connection to your MySQL database using the pymysql library.
import pandas as pd
import os
os.makedirs('Data/',exist_ok=True)
os.listdir('Data')
['title_basics.csv.gz']
A certain amount of transformation will be needed when converting a Raw dataset into a digestable MySQL database, below will show an example of formatting/ transforming our rows into earier to read tables for MySQL to save.
Normalize Genre:
Convert the single string of genres from title basics into 2 new tables.
title_genres: with the columns:
genres:
# read in your csv
basics = pd.read_csv('Data/title_basics.csv.gz', low_memory=False)
# drop columns that are deemed unimportant by the stakeholder
basics=basics.drop(columns=["originalTitle","isAdult","titleType","endYear"])
basics.head()
tconst | primaryTitle | startYear | runtimeMinutes | genres | |
---|---|---|---|---|---|
0 | tt0035423 | Kate & Leopold | 2001.0 | 118 | Comedy,Fantasy,Romance |
1 | tt0062336 | The Tango of the Widower and Its Distorting Mi... | 2020.0 | 70 | Drama |
2 | tt0068865 | Lives of Performers | 2016.0 | 90 | Drama |
3 | tt0069049 | The Other Side of the Wind | 2018.0 | 122 | Drama |
4 | tt0088751 | The Naked Monster | 2005.0 | 100 | Comedy,Horror,Sci-Fi |
# create individual rows, each with a single genre
basics['genres_split'] = basics['genres'].str.split(',')
exploded_genres = basics.explode('genres_split')
exploded_genres
tconst | primaryTitle | startYear | runtimeMinutes | genres | genres_split | |
---|---|---|---|---|---|---|
0 | tt0035423 | Kate & Leopold | 2001.0 | 118 | Comedy,Fantasy,Romance | Comedy |
0 | tt0035423 | Kate & Leopold | 2001.0 | 118 | Comedy,Fantasy,Romance | Fantasy |
0 | tt0035423 | Kate & Leopold | 2001.0 | 118 | Comedy,Fantasy,Romance | Romance |
1 | tt0062336 | The Tango of the Widower and Its Distorting Mi... | 2020.0 | 70 | Drama | Drama |
2 | tt0068865 | Lives of Performers | 2016.0 | 90 | Drama | Drama |
... | ... | ... | ... | ... | ... | ... |
82003 | tt9916190 | Safeguard | 2020.0 | 95 | Action,Adventure,Thriller | Action |
82003 | tt9916190 | Safeguard | 2020.0 | 95 | Action,Adventure,Thriller | Adventure |
82003 | tt9916190 | Safeguard | 2020.0 | 95 | Action,Adventure,Thriller | Thriller |
82004 | tt9916362 | Coven | 2020.0 | 92 | Drama,History | Drama |
82004 | tt9916362 | Coven | 2020.0 | 92 | Drama,History | History |
153827 rows × 6 columns
# sanitycheck to make sure the explode worked
unique_genres = sorted(exploded_genres['genres_split'].unique())
unique_genres
['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy', 'Game-Show', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance', 'Sci-Fi', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western']
title_genres=exploded_genres[['tconst','genres_split']].copy()
title_genres.head()
tconst | genres_split | |
---|---|---|
0 | tt0035423 | Comedy |
0 | tt0035423 | Fantasy |
0 | tt0035423 | Romance |
1 | tt0062336 | Drama |
2 | tt0068865 | Drama |
genre_id_map=dict(zip(unique_genres,range(len(unique_genres))))
genre_id_map
{'Action': 0, 'Adult': 1, 'Adventure': 2, 'Animation': 3, 'Biography': 4, 'Comedy': 5, 'Crime': 6, 'Drama': 7, 'Family': 8, 'Fantasy': 9, 'Game-Show': 10, 'History': 11, 'Horror': 12, 'Music': 13, 'Musical': 14, 'Mystery': 15, 'News': 16, 'Reality-TV': 17, 'Romance': 18, 'Sci-Fi': 19, 'Sport': 20, 'Talk-Show': 21, 'Thriller': 22, 'War': 23, 'Western': 24}
title_genres['genre_id'] = title_genres['genres_split'].map(genre_id_map)
title_genres=title_genres.drop(columns='genres_split')
genre_map=pd.DataFrame({'genre_name':genre_id_map.keys(),
'genre_id':genre_id_map.values()})
genre_map.head()
genre_name | genre_id | |
---|---|---|
0 | Action | 0 |
1 | Adult | 1 |
2 | Adventure | 2 |
3 | Animation | 3 |
4 | Biography | 4 |
#col no longer needed for loading into sql after transformations
basics=basics.drop("genres_split",axis=1)
# verify all columns are how you would like them to be exported into MySQL
basics.head()
tconst | primaryTitle | startYear | runtimeMinutes | genres | |
---|---|---|---|---|---|
0 | tt0035423 | Kate & Leopold | 2001.0 | 118 | Comedy,Fantasy,Romance |
1 | tt0062336 | The Tango of the Widower and Its Distorting Mi... | 2020.0 | 70 | Drama |
2 | tt0068865 | Lives of Performers | 2016.0 | 90 | Drama |
3 | tt0069049 | The Other Side of the Wind | 2018.0 | 122 | Drama |
4 | tt0088751 | The Naked Monster | 2005.0 | 100 | Comedy,Horror,Sci-Fi |
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
#from sqlalchemy_utils import create_database, database_exists
connection_str = "mysql+pymysql://root:root@localhost/Movies"
engine = create_engine(connection_str)
title_genres.to_sql('title_genres',engine,if_exists='replace',index=False)
q = """SELECT *FROM title_genres LIMIT 5;"""
pd.read_sql(q, engine)
tconst | genre_id | |
---|---|---|
0 | tt0035423 | 5 |
1 | tt0035423 | 9 |
2 | tt0035423 | 18 |
3 | tt0062336 | 7 |
4 | tt0068865 | 7 |
genre_map.to_sql('genres',engine,if_exists='replace',index=False)
q = """SELECT *FROM genres LIMIT 5;"""
pd.read_sql(q, engine)
genre_name | genre_id | |
---|---|---|
0 | Action | 0 |
1 | Adult | 1 |
2 | Adventure | 2 |
3 | Animation | 3 |
4 | Biography | 4 |
basics.dtypes
tconst object primaryTitle object startYear float64 runtimeMinutes int64 genres object dtype: object
from sqlalchemy.types import *
key_len = basics['tconst'].fillna('').map(len).max()
title_len = basics['primaryTitle'].fillna('').map(len).max()
gen_len= basics['genres'].fillna('').map(len).max()
df_schema_basics = {
"tconst": String(key_len+1),
"primaryTitle": Text(title_len+1),
"startYear":Float(),
"runtimeMinutes":Integer(),
"genres":Text(gen_len+1)}
basics.to_sql('title_basics',engine,dtype=df_schema_basics,if_exists='replace',index=False)
engine.execute('ALTER TABLE title_basics ADD PRIMARY KEY (`tconst`);')
<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1adb809d310>
q = """SELECT *FROM title_basics LIMIT 5;"""
pd.read_sql(q, engine)
tconst | primaryTitle | startYear | runtimeMinutes | genres | |
---|---|---|---|---|---|
0 | tt0035423 | Kate & Leopold | 2001.0 | 118 | Comedy,Fantasy,Romance |
1 | tt0062336 | The Tango of the Widower and Its Distorting Mi... | 2020.0 | 70 | Drama |
2 | tt0068865 | Lives of Performers | 2016.0 | 90 | Drama |
3 | tt0069049 | The Other Side of the Wind | 2018.0 | 122 | Drama |
4 | tt0088751 | The Naked Monster | 2005.0 | 100 | Comedy,Horror,Sci-Fi |